# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.cluster import KMeans, DBSCAN
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import metrics
import plotly.figure_factory as ff
# Load the merged county-level training data.
merged_data = pd.read_csv('merged_train.csv')
# Peek at the raw rows (display-only; a no-op outside a notebook).
merged_data.head()
# Task 1
# Hold out 20% of the counties for validation; random_state pins the split.
label_columns = ['Democratic', 'Republican', 'Party']
id_and_label_columns = ['State', 'County', 'FIPS'] + label_columns
x_train, x_val, y_train, y_val = train_test_split(
    merged_data.drop(id_and_label_columns, axis=1),
    merged_data[label_columns],
    test_size=0.2,
    random_state=0)
# Report the dimensions of each partition.
for partition in (x_train, y_train, x_val, y_val):
    print(partition.shape)
# Visualize training set: predictor variables
x_train.head()
# Visualize training set: labels
y_train.head()
# Task 2
# Standardize the features: the scaler is fitted on the training split only
# and the same transform is then applied to both splits.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
# Task 3
# Predict per-county vote counts with linear regression over several feature
# subsets.  The original repeated the same fit/print stanza nine times; it is
# factored into one helper (same prints, same order).  Column indices refer to
# positions in x_train_scaled; the mapping to named columns is assumed from
# the project01 report — TODO confirm against merged_train.csv column order.

def _fit_and_score(columns, target):
    """Fit OLS on the given feature columns and report on the validation set.

    Prints the fitted coefficients, then the squared Pearson correlation
    between validation predictions and the true values.

    columns -- list of column indices into the scaled matrices, or None for
               all columns.
    target  -- label column name ('Democratic' or 'Republican').
    """
    x_tr = x_train_scaled if columns is None else x_train_scaled[:, columns]
    x_va = x_val_scaled if columns is None else x_val_scaled[:, columns]
    fitted = linear_model.LinearRegression().fit(x_tr, y_train[target])
    print(fitted.coef_)
    predictions = fitted.predict(x_va)
    # NOTE(review): this is the squared correlation of predictions with the
    # truth, not the true validation R^2 (metrics.r2_score); kept as-is to
    # preserve the original printed output.
    corr_coef = numpy.corrcoef(predictions, y_val[target])[1, 0]
    print(corr_coef ** 2)

# Predicting the number of votes cast for the DEMOCRATIC party in each county
_fit_and_score(None, 'Democratic')                  # all variables
_fit_and_score([0, 1, 2, 11], 'Democratic')         # top 4 from project01
_fit_and_score([0, 2, 11], 'Democratic')            # BEST model for Democratic
_fit_and_score([0, 2, 5, 9, 11, 12], 'Democratic')  # extra variables
# Predicting the number of votes cast for the REPUBLICAN party in each county
_fit_and_score(None, 'Republican')                  # all variables
_fit_and_score([0, 1, 2, 11], 'Republican')         # top 4 from project01
_fit_and_score([0, 1, 11], 'Republican')            # drop least relevant
_fit_and_score([0, 1, 7, 8, 12], 'Republican')      # republican-leaning extras
_fit_and_score([0, 1, 12], 'Republican')            # BEST model for Republican
#Task 4
# Binary party classification on the 4 significant variables from project 1
# (columns [0, 1, 2, 11]).  One decision tree per split criterion; each one
# gets the same confusion-matrix heatmap and statistics printout.
classifier_y_train = y_train['Party']
classifier_y_test = y_val['Party']
for criterion, plot_title in (("entropy", 'Entropy Decision Tree Confusion Matrix'),
                              ("gini", 'Gini Decision Tree Confusion Matrix')):
    tree = DecisionTreeClassifier(criterion=criterion)
    tree.fit(x_train_scaled[:, [0, 1, 2, 11]], classifier_y_train)
    predictions = tree.predict(x_val_scaled[:, [0, 1, 2, 11]])
    cm = metrics.confusion_matrix(classifier_y_test, predictions)
    sns.heatmap(cm, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(plot_title)
    plt.tight_layout()
    # [accuracy, error, per-class precision, recall, F1]
    acc = metrics.accuracy_score(classifier_y_test, predictions)
    print([acc, 1 - acc,
           metrics.precision_score(classifier_y_test, predictions, average=None),
           metrics.recall_score(classifier_y_test, predictions, average=None),
           metrics.f1_score(classifier_y_test, predictions, average=None)])
#Entropy is slightly better here because even though Gini has a better accuracy by 0.01 it has a F1 score less by 0.03
#K-nearest Neighbors
# Choose the number of neighbors with the best validation accuracy.
# BUG FIX: the loop body below had lost its indentation in the original and
# was a syntax error; restored here with no logic changes.
neighbors = [2, 3, 5, 10, 15, 25, 50]
prevAccuracy = 0.0
myN = 0
for n in neighbors:
    classifier = KNeighborsClassifier(n_neighbors=n)
    classifier.fit(x_train_scaled[:, [0, 1, 2, 11]], classifier_y_train)
    y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])
    accuracy = metrics.accuracy_score(classifier_y_test, y_pred)
    # Strict '>' keeps the smallest n when accuracies tie.
    if accuracy > prevAccuracy:
        prevAccuracy = accuracy
        myN = n
print(myN)
#Best model is with 3 neighbors
#Compute statistics for the selected model.
classifier = KNeighborsClassifier(n_neighbors = 3)
classifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])
conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('K=3 nearest neigbhors Confusion Matrix')
plt.tight_layout()
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
#SVMs
# Compare the four standard kernels on the same 4-variable feature set;
# each kernel gets the identical heatmap + statistics report.
for kernel in ('linear', 'poly', 'rbf', 'sigmoid'):
    svm = SVC(kernel=kernel)
    svm.fit(x_train_scaled[:, [0, 1, 2, 11]], classifier_y_train)
    predictions = svm.predict(x_val_scaled[:, [0, 1, 2, 11]])
    cm = metrics.confusion_matrix(classifier_y_test, predictions)
    sns.heatmap(cm, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title('SVM ' + kernel + ' kernel Confusion Matrix')
    plt.tight_layout()
    # [accuracy, error, per-class precision, recall, F1]
    acc = metrics.accuracy_score(classifier_y_test, predictions)
    print([acc, 1 - acc,
           metrics.precision_score(classifier_y_test, predictions, average=None),
           metrics.recall_score(classifier_y_test, predictions, average=None),
           metrics.f1_score(classifier_y_test, predictions, average=None)])
# Rbf is the best kernel
# Now try to find the best C value based on validation accuracy.
# BUG FIX: the loop body below had lost its indentation in the original and
# was a syntax error; restored here with no logic changes.
values = [0.001, 0.01, 0.1, 0.5, 1.0, 5.0, 10]
prevAcc = 0.0
cVal = -1
for v in values:
    classifier = SVC(kernel='rbf', C=v)
    classifier.fit(x_train_scaled[:, [0, 1, 2, 11]], classifier_y_train)
    y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])
    accuracy = metrics.accuracy_score(classifier_y_test, y_pred)
    # Strict '>' keeps the smallest C when accuracies tie.
    if accuracy > prevAcc:
        prevAcc = accuracy
        cVal = v
print(cVal)
#best c val is 5.0
# Get metrics for the chosen model.
classifier = SVC(kernel = 'rbf',C=5.0)
classifier.fit(x_train_scaled[:, [0, 1, 2, 11]],classifier_y_train)
y_pred = classifier.predict(x_val_scaled[:, [0, 1, 2, 11]])
conf_matrix = metrics.confusion_matrix(classifier_y_test,y_pred)
sns.heatmap(conf_matrix,annot=True,fmt=".3f",square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('SVM rbf,c=5.0 kernel Confusion Matrix')
plt.tight_layout()
accuracy = metrics.accuracy_score(classifier_y_test,y_pred)
error = 1 - accuracy
precision = metrics.precision_score(classifier_y_test,y_pred,average = None)
recall = metrics.recall_score(classifier_y_test,y_pred,average=None)
F1_score = metrics.f1_score(classifier_y_test,y_pred,average = None)
print([accuracy,error,precision,recall,F1_score])
# Now that we have the optimal model of each classifier type, retry each one
# on a different variable combination: percent under age 25, median household
# income, percent unemployed, percent less than bachelor's degree, percent
# rural -> columns [6, 8, 9, 11, 12].
combo = [6, 8, 9, 11, 12]
combo_models = [
    (DecisionTreeClassifier(criterion="entropy"),
     'Entropy Decision , newCombo Tree Confusion Matrix'),
    (KNeighborsClassifier(n_neighbors=3),
     '3 neighbors classifier, new Combo Confusion Matrix'),
    (SVC(kernel='rbf', C=5.0),
     'SVM rbf,c=5.0, new Combo Confusion Matrix'),
]
for model_obj, plot_title in combo_models:
    model_obj.fit(x_train_scaled[:, combo], classifier_y_train)
    predictions = model_obj.predict(x_val_scaled[:, combo])
    cm = metrics.confusion_matrix(classifier_y_test, predictions)
    sns.heatmap(cm, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(plot_title)
    plt.tight_layout()
    # [accuracy, error, per-class precision, recall, F1]
    acc = metrics.accuracy_score(classifier_y_test, predictions)
    print([acc, 1 - acc,
           metrics.precision_score(classifier_y_test, predictions, average=None),
           metrics.recall_score(classifier_y_test, predictions, average=None),
           metrics.f1_score(classifier_y_test, predictions, average=None)])
# All three models perform worse on this combination than on the original
# four significant variables.
# The new-combination models all performed worse.
# Now fit each tuned classifier on ALL variables and compare.
all_var_models = [
    (DecisionTreeClassifier(criterion="entropy"),
     'Entropy Decision Tree, all variables Confusion Matrix'),
    (KNeighborsClassifier(n_neighbors=3),
     '3 neighbors classifier, all variables Confusion Matrix'),
    (SVC(kernel='rbf', C=5.0),
     'SVM rbf,c=5.0, all variables Confusion Matrix'),
]
for model_obj, plot_title in all_var_models:
    model_obj.fit(x_train_scaled, classifier_y_train)
    predictions = model_obj.predict(x_val_scaled)
    cm = metrics.confusion_matrix(classifier_y_test, predictions)
    sns.heatmap(cm, annot=True, fmt=".3f", square=True, cmap=plt.cm.Blues)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.title(plot_title)
    plt.tight_layout()
    # [accuracy, error, per-class precision, recall, F1]
    acc = metrics.accuracy_score(classifier_y_test, predictions)
    print([acc, 1 - acc,
           metrics.precision_score(classifier_y_test, predictions, average=None),
           metrics.recall_score(classifier_y_test, predictions, average=None),
           metrics.f1_score(classifier_y_test, predictions, average=None)])
# Tree: accuracy up 0.01 but F1 down 0.04, so the 4-variable tree is better.
# KNN: also worse than the original 4-variable model.
# SVM: actually better than both other all-variable models.
# Overall best: K nearest neighbors with 3 neighbors on the original variable
# combination — similar accuracy to the alternatives but ~0.07 higher F-score.
#Task 5
# Unsupervised clustering on four demographic variables; each clustering is
# compared against the two-party ground truth via a contingency matrix,
# adjusted Rand index, and silhouette coefficient.
# Choosing the variables
X = merged_data[['Total Population', 'Percent White, not Hispanic or Latino', 'Percent Black, not Hispanic or Latino',"Percent Less than Bachelor's Degree"]]
Y = merged_data['Party']
#Hierarchical Clustering
# Standardize before clustering so no single variable dominates the distances.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
# Single-linkage agglomerative clustering, cut into (at most) 2 flat clusters.
clustering = linkage(X_scaled,method = "single",metric = "euclidean")
clusters = fcluster(clustering,2,criterion = 'maxclust')
cont_matrix = metrics.cluster.contingency_matrix(merged_data["Party"],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
# NOTE(review): silhouette is computed on the UNSCALED X although the
# clustering ran on X_scaled — confirm which feature space is intended.
silhouette_coefficient = metrics.silhouette_score(X, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])
# Plot clusters found using hierarchical clustering with single linkage method
# data['clusters'] = clusters
ax = merged_data.plot(kind = 'scatter', x = 'Total Population', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent White, not Hispanic or Latino', y = 'Percent Black, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = "Percent Less than Bachelor's Degree", c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Total Population', y = "Percent Less than Bachelor's Degree", c = 'Party', colormap = plt.cm.brg)
# Complete Linkage using euclidean
# NOTE(review): this pass clusters the UNSCALED X while the single-linkage
# pass above used X_scaled — confirm whether that is intentional.
clustering = linkage(X, method = "complete", metric = "euclidean")
clusters = fcluster(clustering, 2, criterion = 'maxclust')
cont_matrix = metrics.cluster.contingency_matrix(merged_data['Party'],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
# BUG FIX: score the cluster assignment, not the ground-truth Party labels.
# The original passed merged_data['Party'], which measures the labelling
# itself and yields the same number for every clustering method.
silhouette_coefficient = metrics.silhouette_score(X, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
#KMeans Clustering with a single random initialisation (n_init = 1).
# NOTE(review): k-means runs on the unscaled X here — confirm intent.
clustering = KMeans(n_clusters = 2, init = 'random', n_init = 1, random_state = 0).fit(X)
clusters = clustering.labels_
cont_matrix = metrics.cluster.contingency_matrix(merged_data['Party'],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
# BUG FIX: score the k-means assignment, not the ground-truth Party labels
# (the original passed merged_data['Party'] here).
silhouette_coefficient = metrics.silhouette_score(X, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
#KMeans with more initialisations (n_init = 10, k-means++ seeding).
# NOTE(review): no random_state is set, so results can vary between runs.
clustering = KMeans(n_clusters = 2, init = 'k-means++', n_init = 10).fit(X)
# clustering = KMeans(n_clusters = 4, init = 'random', n_init = 20, random_state = 0).fit(X)
clusters = clustering.labels_
cont_matrix = metrics.cluster.contingency_matrix(merged_data['Party'],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
# BUG FIX: score the k-means assignment, not the ground-truth Party labels
# (the original passed merged_data['Party'] here).
silhouette_coefficient = metrics.silhouette_score(X, clusters, metric = "euclidean")
print([adjusted_rand_index, silhouette_coefficient])
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
#DBSCAN
clustering = DBSCAN(eps = 1, min_samples = 5, metric = "euclidean").fit(X)
clusters = clustering.labels_
cont_matrix = metrics.cluster.contingency_matrix(merged_data['Party'],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
# BUG FIX: score the DBSCAN assignment, not the ground-truth Party labels
# (the original passed merged_data['Party'] here).  Silhouette is undefined
# when DBSCAN yields fewer than two distinct labels, so fall back to NaN.
if len(set(clusters)) > 1:
    silhouette_coefficient = metrics.silhouette_score(X, clusters, metric = "euclidean")
else:
    silhouette_coefficient = float('nan')
print([adjusted_rand_index, silhouette_coefficient])
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
# DBSCAN with different eps and min_sample values and metrics
clustering = DBSCAN(eps = 5, min_samples = 500, metric = "manhattan").fit(X)
clusters = clustering.labels_
cont_matrix = metrics.cluster.contingency_matrix(merged_data['Party'],clusters)
sns.heatmap(cont_matrix, annot = True, fmt = ".3f", square = True, cmap = plt.cm.Blues)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Contingency matrix')
plt.tight_layout()
adjusted_rand_index = metrics.adjusted_rand_score(merged_data['Party'], clusters)
# BUG FIX: score the DBSCAN assignment, not the ground-truth Party labels
# (the original passed merged_data['Party'] here).  With min_samples=500 the
# model may well find fewer than two labels, where silhouette is undefined.
if len(set(clusters)) > 1:
    silhouette_coefficient = metrics.silhouette_score(X, clusters, metric = "euclidean")
else:
    silhouette_coefficient = float('nan')
print([adjusted_rand_index, silhouette_coefficient])
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
# Evaluation metrics
# NOTE(review): this scores the ground-truth Party labelling itself —
# presumably a baseline silhouette to compare the clusterings against;
# confirm that is the intent.
silhouette_coefficient = metrics.silhouette_score(X, merged_data['Party'], metric = "euclidean")
print(silhouette_coefficient)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = 'Percent Black, not Hispanic or Latino', y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Percent White, not Hispanic or Latino', c = 'Party', colormap = plt.cm.brg)
ax = merged_data.plot(kind = 'scatter', x = "Percent Less than Bachelor's Degree", y = 'Total Population', c = 'Party', colormap = plt.cm.brg)
#Task 6
# Predict the party of every county in the unlabeled test file and draw a
# county-level choropleth map of the result.
test_data = pd.read_csv('demographics_test.csv')
new_train = merged_data.drop(['State', 'County', 'FIPS', 'Democratic', 'Republican', 'Party'], axis=1)
new_train_val = merged_data['Party']
scaler = StandardScaler()
scaler.fit(new_train)
new_train_scaled = scaler.transform(new_train)
new_test_data = test_data.drop(['State', 'County', 'FIPS'], axis=1)
# BUG FIX: the test set must be standardized with the scaler fitted on the
# TRAINING data.  The original fitted a second scaler on the test set, which
# put train and test in different feature spaces and skewed the predictions.
new_test_data_scaled = scaler.transform(new_test_data)
new_test_data.head()
#Train best classification model on whole merged data with the best combination of variables and get prediction of the test data
classifier = KNeighborsClassifier(n_neighbors = 3)
classifier.fit(new_train_scaled[:, [0, 1, 2, 11]],new_train_val)
y_pred = classifier.predict(new_test_data_scaled[:, [0, 1, 2, 11]])
fips = test_data['FIPS'].tolist()
values = y_pred.tolist()
fig = ff.create_choropleth(fips=fips, values=values, colorscale=['rgb(255, 0, 0)', 'rgb(0, 0, 255)'], title='US Counties Political Map', legend_title='1 = Democratic, 0 = Republican')
fig.layout.template = None
fig.show()
# Task 7
# Fit the best regression models (per Task 3) and the best classifier (per
# Task 4) on the FULL training set, then write per-county predictions for
# the test counties to Output.csv.
merged_data.head()
new_train_scaled  # display-only leftover from notebook use; no effect here
# Democratic vote count: best feature subset was columns [0, 2, 11].
democratic_regression_model = linear_model.LinearRegression().fit(
    new_train_scaled[:, [0, 2, 11]], merged_data['Democratic'])
print(democratic_regression_model.coef_)
democratic_regression_predicted = democratic_regression_model.predict(
    new_test_data_scaled[:, [0, 2, 11]])
# Republican vote count: best feature subset was columns [0, 1, 12].
republican_regression_model = linear_model.LinearRegression().fit(
    new_train_scaled[:, [0, 1, 12]], merged_data['Republican'])
print(republican_regression_model.coef_)
republican_regression_predicted = republican_regression_model.predict(
    new_test_data_scaled[:, [0, 1, 12]])
# Party label: 3-nearest-neighbors on columns [0, 1, 2, 11].
classifier = KNeighborsClassifier(n_neighbors=3)
classifier.fit(new_train_scaled[:, [0, 1, 2, 11]], new_train_val)
y_pred = classifier.predict(new_test_data_scaled[:, [0, 1, 2, 11]])
# Assemble the final table and save it.
names = test_data['State'].tolist()
counties = test_data['County'].tolist()
print(names)
print(counties)
print(republican_regression_predicted)
output_dataFrame = pd.DataFrame(
    {'State': names,
     'County': counties,
     'Democratic': democratic_regression_predicted.tolist(),
     'Republican': republican_regression_predicted.tolist(),
     'Party': y_pred.tolist()},
    columns=['State', 'County', 'Democratic', 'Republican', 'Party'])
output_dataFrame.head()
output_dataFrame.to_csv("Output.csv")